In [1]:
# Python libraries
# Classic: data manipulation and linear algebra
import pandas as pd
import numpy as np

# Plots
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.tools as tls
import plotly.figure_factory as ff
In [2]:
# Path to the raw insurance dataset, relative to the notebook's working directory.
path = 'dataset/insurance.csv'
# Load the data and preview the first five rows.
data = pd.read_csv(path)
data.head()
Out[2]:
age sex bmi children smoker region charges
0 19 female 27.900 0 yes southwest 16884.92400
1 18 male 33.770 1 no southeast 1725.55230
2 28 male 33.000 3 no southeast 4449.46200
3 33 male 22.705 0 no northwest 21984.47061
4 32 male 28.880 0 no northwest 3866.85520
In [3]:
data.tail().T
Out[3]:
1333 1334 1335 1336 1337
age 50 18 18 21 61
sex male female female female female
bmi 30.97 31.92 36.85 25.8 29.07
children 3 0 0 0 0
smoker no no no no yes
region northwest northeast southeast southwest northwest
charges 10600.5483 2205.9808 1629.8335 2007.945 29141.3603
In [4]:
data.drop('region', axis=1, inplace=True)
In [5]:
data.isnull().sum()
Out[5]:
age         0
sex         0
bmi         0
children    0
smoker      0
charges     0
dtype: int64
In [6]:
data.shape
Out[6]:
(1338, 6)
In [7]:
data.describe()
Out[7]:
age bmi children charges
count 1338.000000 1338.000000 1338.000000 1338.000000
mean 39.207025 30.663397 1.094918 13270.422265
std 14.049960 6.098187 1.205493 12110.011237
min 18.000000 15.960000 0.000000 1121.873900
25% 27.000000 26.296250 0.000000 4740.287150
50% 39.000000 30.400000 1.000000 9382.033000
75% 51.000000 34.693750 2.000000 16639.912515
max 64.000000 53.130000 5.000000 63770.428010
In [8]:
# One interactive histogram per column (first use of plotly express in
# this notebook, hence the import here).
import plotly.express as px

for col in data.columns:
    col_fig = px.histogram(data, x=col)
    col_fig.update_layout(bargap=0.2, width=500, height=300)
    col_fig.show()
In [9]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 62.8+ KB
In [10]:
data["sex"].unique()
Out[10]:
array(['female', 'male'], dtype=object)
In [11]:
data["smoker"].unique()
Out[11]:
array(['yes', 'no'], dtype=object)
In [12]:
data.to_csv("preprocessed.csv")
In [13]:
encoding = {"sex": {"male": 1, "female": 0},"smoker": {"yes": 1, "no": 0}}
df= data.replace(encoding)
df.head()
Out[13]:
age sex bmi children smoker charges
0 19 0 27.900 0 1 16884.92400
1 18 1 33.770 1 0 1725.55230
2 28 1 33.000 3 0 4449.46200
3 33 1 22.705 0 0 21984.47061
4 32 1 28.880 0 0 3866.85520
In [14]:
import plotly.express as px

# Pairwise scatter plots of every encoded column, coloured by charges.
matrix_fig = px.scatter_matrix(df, dimensions=df.columns, color="charges")
matrix_fig.update_layout(
    title='Scatterplot Matrix',
    dragmode='select',
    width=800,
    height=800,
    hovermode='closest',
)
matrix_fig.show()
In [15]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   int64  
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   int64  
 5   charges   1338 non-null   float64
dtypes: float64(2), int64(4)
memory usage: 62.8 KB
In [16]:
# Correlation matrix rendered as an interactive heatmap.
corr = df.corr()
heatmap = go.Heatmap(
    z=corr.values,
    x=list(corr.columns),
    y=list(corr.index),
    colorscale='Viridis',
)
fig = go.Figure(data=heatmap)
fig.update_layout(title='Correlation', width=500, height=500)
fig.show()
In [17]:
df.corr()['charges'].sort_values()
Out[17]:
sex         0.057292
children    0.067998
bmi         0.198341
age         0.299008
smoker      0.787251
charges     1.000000
Name: charges, dtype: float64
In [18]:
sns.catplot(x="smoker", kind="count",hue = 'sex',palette = 'magma', data=data)
Out[18]:
<seaborn.axisgrid.FacetGrid at 0x19115795be0>
In [19]:
sns.catplot(x="sex", y="charges", hue="smoker",kind="violin", data=data, palette = 'magma')
Out[19]:
<seaborn.axisgrid.FacetGrid at 0x191119d2c40>
In [20]:
# Charges for women (encoded sex == 0), split by smoker status.
women = df[df.sex == 0]
plt.figure(figsize=(12,5))
plt.title("Box plot for charges of women")
sns.boxplot(y="smoker", x="charges", data=women, orient="h", palette='rainbow')
Out[20]:
<AxesSubplot:title={'center':'Box plot for charges of women'}, xlabel='charges', ylabel='smoker'>
In [21]:
# Charges for men (encoded sex == 1), split by smoker status.
men = df[df.sex == 1]
plt.figure(figsize=(12,5))
plt.title("Box plot for charges of men")
sns.boxplot(y="smoker", x="charges", data=men, orient="h", palette='rainbow')
Out[21]:
<AxesSubplot:title={'center':'Box plot for charges of men'}, xlabel='charges', ylabel='smoker'>
In [22]:
#"Distribution of age"
sns.displot(data["age"], color = 'g')
Out[22]:
<seaborn.axisgrid.FacetGrid at 0x19117dc0070>
In [23]:
sns.catplot(x="smoker", kind="count",hue = 'sex', palette="rainbow", data=data[(data.age == 18)])
plt.title("The number of smokers and non-smokers (18 years old)")
Out[23]:
Text(0.5, 1.0, 'The number of smokers and non-smokers (18 years old)')
In [24]:
# Charges of 18-year-old patients, split by smoker status.
eighteen = data[data.age == 18]
plt.figure(figsize=(12,5))
plt.title("Box plot for charges 18 years old smokers")
sns.boxplot(y="smoker", x="charges", data=eighteen, orient="h", palette='pink')
Out[24]:
<AxesSubplot:title={'center':'Box plot for charges 18 years old smokers'}, xlabel='charges', ylabel='smoker'>
In [25]:
# Joint KDE of age vs charges for non-smokers, with raw points overlaid
# as white "+" markers.
nonsmokers = df[df.smoker == 0]
g = sns.jointplot(x="age", y="charges", data=nonsmokers, kind="kde", color="m")
g.plot_joint(plt.scatter, c="w", s=30, linewidth=1, marker="+")
g.ax_joint.collections[0].set_alpha(0)  # hide the outer density fill
g.set_axis_labels("$X$", "$Y$")
# 'Distribution of charges and age for non-smokers'
Out[25]:
<seaborn.axisgrid.JointGrid at 0x19117faddf0>
In [26]:
# Joint KDE of age vs charges for smokers, with raw points overlaid
# as white "+" markers.
smokers = df[df.smoker == 1]
g = sns.jointplot(x="age", y="charges", data=smokers, kind="kde", color="c")
g.plot_joint(plt.scatter, c="w", s=30, linewidth=1, marker="+")
g.ax_joint.collections[0].set_alpha(0)  # hide the outer density fill
g.set_axis_labels("$X$", "$Y$")
# 'Distribution of charges and age for smokers'
Out[26]:
<seaborn.axisgrid.JointGrid at 0x191191392b0>
In [27]:
sns.lmplot(x="age", y="charges", hue="smoker", data=data, palette = 'inferno_r', height = 7)
#Smokers and non-smokers'
Out[27]:
<seaborn.axisgrid.FacetGrid at 0x19117f5a850>
In [28]:
#"Distribution of bmi"
ax = sns.displot(data["bmi"], color = 'm')
In [29]:
#"Distribution of charges for patients with BMI greater than 30"
sns.displot(data[(data.bmi >= 30)]['charges'], color = 'm')
Out[29]:
<seaborn.axisgrid.FacetGrid at 0x1911933ca00>
In [30]:
#"Distribution of charges for patients with BMI less than 30"
sns.displot(data[(data.bmi < 30)]['charges'], color = 'b')
Out[30]:
<seaborn.axisgrid.FacetGrid at 0x191192dc400>
In [31]:
# Joint KDE of bmi vs charges across all patients, with raw points
# overlaid as white "+" markers.
g = sns.jointplot(x="bmi", y="charges", data=data, kind="kde", color="r")
g.plot_joint(plt.scatter, c="w", s=30, linewidth=1, marker="+")
g.ax_joint.collections[0].set_alpha(0)  # hide the outer density fill
g.set_axis_labels("$X$", "$Y$")
# ax.set_title('Distribution of bmi and charges')
Out[31]:
<seaborn.axisgrid.JointGrid at 0x1911937bd90>
In [32]:
# Scatter of charges vs bmi coloured by smoker, followed by per-group
# linear fits of the same relationship.
plt.figure(figsize=(10,6))
ax = sns.scatterplot(data=data, x='bmi', y='charges', hue='smoker', palette='magma')
ax.set_title('Scatter plot of charges and bmi')

sns.lmplot(x="bmi", y="charges", hue="smoker", data=data, palette='magma', height=8)
Out[32]:
<seaborn.axisgrid.FacetGrid at 0x191192d2af0>
In [33]:
sns.catplot(x="children", kind="count", palette="ch:.25", data=data, height = 6)
Out[33]:
<seaborn.axisgrid.FacetGrid at 0x19119779a90>
In [34]:
sns.catplot(x="smoker", kind="count", palette="rainbow",hue = "sex",
            data=data[(data.children > 0)], height = 6)
#'Smokers and non-smokers who have childrens'
Out[34]:
<seaborn.axisgrid.FacetGrid at 0x191193cb490>
In [35]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error
In [36]:
# Features (everything except the target) and target.
X = df.drop(columns=['charges'])
Y = df['charges']

# Hold out 20% for testing; fixed seed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=101
)
In [37]:
# Sanity-check the split sizes: 1070 train rows, 268 test rows.
for split in (X_train, X_test):
    print(split.shape)
(1070, 5)
(268, 5)
In [38]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline; fit() returns the estimator itself.
lr = LinearRegression().fit(X_train, Y_train)

y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)
In [39]:
# NOTE(review): for sklearn regressors, .score() returns the R^2
# coefficient of determination, not classification accuracy — the
# "Accuracy" labels below are a misnomer kept for output compatibility.
lr_train_acc = lr.score(X_train,Y_train)
print("Train Accuracy: "+ str(lr_train_acc))
lr_acc= lr.score(X_test,Y_test)
print("Test Accuracy: "+ str(lr_acc))
Train Accuracy: 0.7461111742852395
Test Accuracy: 0.7618254042736423
In [40]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with depth capped at 5 to limit overfitting; seeded for
# reproducibility, all cores used for fitting.
forest = RandomForestRegressor(
    n_estimators=1000,
    criterion='squared_error',
    random_state=100,
    n_jobs=-1,
    max_depth=5,
)
forest.fit(X_train, Y_train)
forest_train_pred = forest.predict(X_train)
forest_test_pred = forest.predict(X_test)

# Score is R^2 on each split.
rfc_train_acc = forest.score(X_train, Y_train)
print(f"Train Accuracy: {rfc_train_acc}")
rfc_acc = forest.score(X_test, Y_test)
print(f"Test Accuracy: {rfc_acc}")
Train Accuracy: 0.8891494245729069
Test Accuracy: 0.8629064897819936
In [41]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree, depth capped at 5 like the forest above.
# random_state pins the tie-breaking among equally good splits so the
# reported scores are reproducible; the tree was previously unseeded
# while every other seeded step of this notebook is fixed.
dtr = DecisionTreeRegressor(criterion='squared_error', max_depth=5, random_state=100)
dtr.fit(X_train,Y_train)
# Predict
dtr_train_pred = dtr.predict(X_train)
dtr_test_pred = dtr.predict(X_test)

# Score is R^2 on each split, not classification accuracy.
dtr_train_acc = dtr.score(X_train,Y_train)
print("Train Accuracy: "+ str(dtr_train_acc))
dtr_acc= dtr.score(X_test,Y_test)
print("Test Accuracy: "+ str(dtr_acc))
Train Accuracy: 0.8802943823311056
Test Accuracy: 0.8618463447218458
In [42]:
from sklearn import linear_model

# L1-regularised linear fit (alpha=0.1 — near-OLS, hence scores close to
# the plain LinearRegression above).
lassoReg = linear_model.Lasso(alpha=0.1)
lassoReg.fit(X_train, Y_train)

# Predictions on both splits.
lassoReg_train_pred = lassoReg.predict(X_train)
lassoReg_test_pred = lassoReg.predict(X_test)

# Score is R^2 on each split.
lassoReg_train_acc = lassoReg.score(X_train, Y_train)
print(f"Train Accuracy: {lassoReg_train_acc}")
lassoReg_acc = lassoReg.score(X_test, Y_test)
print(f"Test Accuracy: {lassoReg_acc}")
Train Accuracy: 0.7461111736377029
Test Accuracy: 0.7618255389520554
In [43]:
# Residual plot for the random forest: prediction error vs predicted
# value for both splits; the red line marks zero error.
plt.figure(figsize=(10, 6))
for preds, actual, colour, alpha, label in (
    (forest_train_pred, Y_train, 'black', 0.5, 'Train data'),
    (forest_test_pred, Y_test, 'c', 0.7, 'Test data'),
):
    plt.scatter(preds, preds - actual, c=colour, marker='o', s=35,
                alpha=alpha, label=label)
plt.xlabel('Predicted values')
plt.ylabel('Tailings')
plt.legend(loc='upper left')
plt.hlines(y=0, xmin=0, xmax=60000, lw=2, color='red')
plt.show()
In [44]:
# Collect each model's test R^2 into one table, best first.
model_names = ['Linear Regression', 'Random Forest Regression',
               'Decision Tree Regression', 'Lasso Regression']
model_scores = [lr_acc, rfc_acc, dtr_acc, lassoReg_acc]
models = pd.DataFrame({'Model': model_names, 'Score': model_scores})
models.sort_values(by='Score', ascending=False)
Out[44]:
Model Score
1 Random Forest Regression 0.862906
2 Decision Tree Regression 0.861846
3 Lasso Regression 0.761826
0 Linear Regression 0.761825
In [45]:
models.to_csv(r'models2.csv',index=False)
In [46]:
# Bar chart of test scores per model.
bar_fig = px.bar(models, x='Model', y='Score')
bar_fig.update_layout(width=500, height=500)
bar_fig.show()
In [47]:
# Persist the fitted random-forest model with pickle.
import pickle
import warnings
# NOTE(review): blanket warning suppression hides ALL subsequent warnings,
# not just pickle-related ones — consider scoping or removing.
warnings.filterwarnings('ignore')
# 'with' guarantees the file is closed even if dump() raises, unlike the
# previous manual open()/close() pair.
with open("result.pkl", "wb") as pickle_out:
    pickle.dump(forest, pickle_out)
In [48]:
# Reload the pickled model. The original code never closed the file
# handle; 'with' closes it deterministically.
# NOTE: pickle.load executes arbitrary code — only load trusted files.
with open('result.pkl', 'rb') as pickle_in:
    rfr = pickle.load(pickle_in)
In [49]:
# Predict for one patient. Wrapping the row in a DataFrame with the same
# column names the model was trained on keeps the input self-documenting
# and avoids sklearn's "X does not have valid feature names" warning that
# a bare nested list triggers for estimators fitted on a DataFrame.
sample = pd.DataFrame(
    [[28, 1, 33, 3, 0]],
    columns=['age', 'sex', 'bmi', 'children', 'smoker'],
)
prediction = rfr.predict(sample)
print(prediction)
[6332.10105162]
In [50]:
result = prediction * (75.62)
res = result[0]
round(res,2)
Out[50]:
478833.48